#!/usr/bin/env python
# -*- coding: UTF-8 -*-



import sys
import re

def formatUrl(url):
    """Normalize a URL string for use as a lookup/match key.

    Removes one leading "http://" scheme prefix (if present) and then one
    trailing "/" (if present). Nothing else is touched: other schemes such
    as "https://" are left as they are, and only a single prefix / a single
    trailing slash is removed per call.

    Args:
        url: the URL string to normalize.

    Returns:
        The normalized URL string.
    """
    scheme = "http://"
    without_scheme = url[len(scheme):] if url.startswith(scheme) else url
    return without_scheme[:-1] if without_scheme.endswith("/") else without_scheme

conversionList={}
# url2id maps each normalized page URL from the configuration table to a dict
# {cid + sid: [cid, sid, cpid, page]}, i.e. the configured ids that correspond
# to that URL.
url2id={}
# The configuration file path is taken from the first command-line argument.
with open(sys.argv[1]) as f:
    for line in f:
        t = line.strip().split("\001")
        if len(t) != 4:
            # Only records with exactly four \001-separated fields are used.
            continue
        # Original note on this line: "aid" is the id assigned automatically
        # by the table (presumably this refers to cpid -- TODO confirm).
        cpid, cid, sid, page = t
        # cid/sid do not depend on the URL, so strip them once per record.
        cid = cid.strip()
        sid = sid.strip()
        # A single page field may list several URLs separated by "!_@_!".
        for url in page.split("!_@_!"):
            url = formatUrl(url.strip())
            # setdefault replaces the deprecated dict.has_key() check; a later
            # record with the same url and cid+sid overwrites the earlier one.
            url2id.setdefault(url, {})[cid + sid] = [cid, sid, cpid, page]

# Each configured URL is used as a regular expression that must match the
# visited URL from its start through to its end (re.match + trailing "$").
# Compile every pattern once up front instead of once per (ep, url) pair.
urlPatterns = []
for url, idMap in url2id.items():
    try:
        urlPatterns.append((re.compile(url + "$"), idMap))
    except (re.error, OverflowError) as err:
        # A configured URL that is not a valid regular expression can never
        # match; report it once and skip it rather than silently swallowing
        # every exception inside the matching loop.
        sys.stderr.write("skipping invalid url pattern %r: %s\n" % (url, err))

# Each stdin record is expected to hold four tab-separated fields:
# cid, sid, cookie and a comma-separated list of URLs (eps).
for line in sys.stdin:
    t = line.strip().split("\t")
    if len(t) != 4:
        continue
    cid, sid, cookie, eps = t
    # Lookup key built the same way as when url2id was populated.
    k = cid.strip() + sid.strip()
    for ep in eps.strip().split(","):
        # Normalize once per URL. formatUrl is not idempotent (e.g. "a//"),
        # so re-applying it for every pattern would change ep between
        # patterns and make the result depend on dict iteration order.
        ep = formatUrl(ep.strip())
        for pattern, idMap in urlPatterns:
            if pattern.match(ep) and k in idMap:
                # Emit cid, sid, cpid, page and the cookie, tab-separated.
                # Single-argument parenthesised print works as both the
                # Python 2 statement and the Python 3 function.
                print("\t".join(idMap[k]) + "\t" + cookie)

